31. Quiz Solutions
Solutions
Quiz solution code
CSVs in Python
import unicodecsv
def read_csv(filename):
    """Read a CSV file into a list of row dictionaries.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list holding every row yielded by ``unicodecsv.DictReader``.
    """
    # Opened in binary mode ('rb'); the rows are produced by unicodecsv.
    with open(filename, 'rb') as f:
        reader = unicodecsv.DictReader(f)
        # Materialise all rows before the ``with`` block closes the file.
        return list(reader)
# Load the three data files, in order, into one list of rows each.
enrollments, daily_engagement, project_submissions = map(
    read_csv,
    ('enrollments.csv', 'daily_engagement.csv', 'project_submissions.csv'))
Investigating the Data
# Bare len(...) expressions are notebook cell outputs: row count first,
# then the number of distinct account keys in the same table.
len(enrollments)

unique_enrolled_students = set()
for enrollment in enrollments:
    unique_enrolled_students.add(enrollment['account_key'])
len(unique_enrolled_students)

len(daily_engagement)

# At this point the engagement rows store the account key under 'acct'.
unique_engagement_students = set()
for engagement_record in daily_engagement:
    unique_engagement_students.add(engagement_record['acct'])
len(unique_engagement_students)

len(project_submissions)

unique_project_submitters = set()
for submission in project_submissions:
    unique_project_submitters.add(submission['account_key'])
len(unique_project_submitters)
Problems in the Data
# Rename the 'acct' field to 'account_key' so the engagement rows use the
# same key name as the enrollment and submission rows.
for engagement_record in daily_engagement:
    # pop() removes the old key and returns its value in a single step.
    engagement_record['account_key'] = engagement_record.pop('acct')
Missing engagement records
# Print the first enrollment whose student has no engagement record.
for enrollment in enrollments:
    student = enrollment['account_key']
    if student not in unique_engagement_students:
        print(enrollment)
        # One example is enough; stop at the first hit.
        break
Checking for more problem records
# Count (and print) enrollments that have no engagement record even though
# the join date and the cancel date differ.
num_problem_students = 0
for enrollment in enrollments:
    student = enrollment['account_key']
    if (student not in unique_engagement_students and
            enrollment['join_date'] != enrollment['cancel_date']):
        print(enrollment)
        num_problem_students += 1

num_problem_students
Refining the Question
# Map each paid student's account key to their most recent join date.
# An enrollment counts as paid when it has not been canceled or when it
# was canceled after more than 7 days.
paid_students = {}
for enrollment in non_udacity_enrollments:
    if (not enrollment['is_canceled'] or
            enrollment['days_to_cancel'] > 7):
        account_key = enrollment['account_key']
        enrollment_date = enrollment['join_date']
        # The membership test must come first: `or` short-circuits, so the
        # dictionary lookup only runs for keys that are already present.
        if (account_key not in paid_students or
                enrollment_date > paid_students[account_key]):
            paid_students[account_key] = enrollment_date

len(paid_students)
Note that if you switch the order of the two conditions in the second if statement, like so:
if (enrollment_date > paid_students[account_key] or
account_key not in paid_students)
you will most likely get an error. Why do you think that is?
Check out this Stack Overflow discussion to find out more:
http://stackoverflow.com/questions/13960657/does-python-evaluate-ifs-conditions-lazily
Getting Data from First Week
Getting Data from First Week
def within_one_week(join_date, engagement_date):
    """Return True if the engagement is fewer than 7 days after joining.

    Args:
        join_date: Date the student joined.
        engagement_date: Date of the engagement record.

    Returns:
        True when ``(engagement_date - join_date).days`` is below 7.
        Note that this is also the case for engagement dates that fall
        before ``join_date``, because a negative day count is below 7.
    """
    time_delta = engagement_date - join_date
    return time_delta.days < 7
def remove_free_trial_cancels(data):
    """Keep only the rows that belong to paid students.

    Args:
        data: Iterable of row dicts, each with an 'account_key' entry.

    Returns:
        A new list with the rows whose 'account_key' is present in the
        module-level ``paid_students`` mapping; the input is not modified.
    """
    return [
        data_point for data_point in data
        if data_point['account_key'] in paid_students
    ]
# Restrict all three tables to paid students.
paid_enrollments = remove_free_trial_cancels(non_udacity_enrollments)
paid_engagement = remove_free_trial_cancels(non_udacity_engagement)
paid_submissions = remove_free_trial_cancels(non_udacity_submissions)

print(len(paid_enrollments))
print(len(paid_engagement))
print(len(paid_submissions))

# Collect the engagement rows that within_one_week() accepts, comparing
# each row's date with the join date stored for that student.
paid_engagement_in_first_week = []
for engagement_record in paid_engagement:
    account_key = engagement_record['account_key']
    join_date = paid_students[account_key]
    engagement_record_date = engagement_record['utc_date']

    if within_one_week(join_date, engagement_record_date):
        paid_engagement_in_first_week.append(engagement_record)

len(paid_engagement_in_first_week)
Debugging Data Analysis Code
Debugging Data Analysis Code
Here is the code Caroline shows in the solution video:
# Find the student with the largest first-week minute total.
student_with_max_minutes = None
max_minutes = 0

for student, total_minutes in total_minutes_by_account.items():
    if total_minutes > max_minutes:
        max_minutes = total_minutes
        student_with_max_minutes = student

max_minutes

# Print every first-week engagement row of that student for inspection.
for engagement_record in paid_engagement_in_first_week:
    if engagement_record['account_key'] == student_with_max_minutes:
        print(engagement_record)
Alternatively, you can find the account key with the maximum minutes using this shorthand notation:
# Evaluates to the (key, total) pair with the largest total (pair[1]).
max(total_minutes_by_account.items(), key=lambda pair: pair[1])
Fixing Bug in within_one_week()
She also updated the code for the within_one_week function to the following:
def within_one_week(join_date, engagement_date):
    """Return True if the engagement falls in the week starting at joining.

    Args:
        join_date: Date the student joined.
        engagement_date: Date of the engagement record.

    Returns:
        True when ``(engagement_date - join_date).days`` is at least 0 and
        below 7; engagement dated before ``join_date`` is rejected.
    """
    time_delta = engagement_date - join_date
    return 0 <= time_delta.days < 7
Lessons completed in first week
Lessons Completed in First Week
First, Caroline refactors the given code to analyze total minutes spent in the first week into the following:
from collections import defaultdict
def group_data(data, key_name):
    """Group row dicts by the value stored under ``key_name``.

    Args:
        data: Iterable of row dicts.
        key_name: Name of the field to group by.

    Returns:
        A ``defaultdict(list)`` mapping each distinct field value to the
        list of rows carrying it, in their original order.
    """
    grouped_data = defaultdict(list)
    for data_point in data:
        key = data_point[key_name]
        grouped_data[key].append(data_point)
    return grouped_data
# First-week engagement rows of paid students, grouped per account.
engagement_by_account = group_data(
    data=paid_engagement_in_first_week,
    key_name='account_key')
def sum_grouped_items(grouped_data, field_name):
    """Sum one field over every group of rows.

    Args:
        grouped_data: Mapping from a key to a list of row dicts.
        field_name: Name of the numeric field to add up.

    Returns:
        A dict mapping each key of ``grouped_data`` to the sum of
        ``field_name`` over that key's rows (0 for an empty group).
    """
    summed_data = {}
    for key, data_points in grouped_data.items():
        total = 0
        for data_point in data_points:
            total += data_point[field_name]
        summed_data[key] = total
    return summed_data
# Total minutes visited per account.
total_minutes_by_account = sum_grouped_items(
    grouped_data=engagement_by_account,
    field_name='total_minutes_visited')
import numpy as np
def describe_data(data):
    """Print the mean, standard deviation, minimum and maximum of ``data``.

    Args:
        data: Iterable of numbers, e.g. the ``.values()`` of a dict of
            per-account totals.
    """
    # Copy into a list first so that dict views are accepted by NumPy.
    values = list(data)
    # '%s' formatting prints the same text under Python 2 and Python 3.
    print('Mean: %s' % np.mean(values))
    print('Standard deviation: %s' % np.std(values))
    print('Minimum: %s' % np.min(values))
    print('Maximum: %s' % np.max(values))
# Summarise the per-account minute totals.
describe_data(data=total_minutes_by_account.values())
Then she called the functions she created to analyze the lessons completed in the first week as follows:
# Lessons completed per account, followed by their summary statistics.
lessons_completed_by_account = sum_grouped_items(
    grouped_data=engagement_by_account,
    field_name='lessons_completed')
describe_data(data=lessons_completed_by_account.values())
Number of Visits in the First Week
Number of Visits in the First Week
Here is the code Caroline shows in the solution video. First she ran this code to create the has_visited field:
# Add a 0/1 'has_visited' flag to each engagement row: 1 when at least
# one course was visited according to that row, 0 otherwise.
for engagement_record in paid_engagement:
    if engagement_record['num_courses_visited'] > 0:
        engagement_record['has_visited'] = 1
    else:
        engagement_record['has_visited'] = 0
Then, after recreating the engagement_by_account dictionary with the updated data, she ran the following code to analyze days visited in the first week:
# Summing the 0/1 flag gives the number of rows with a visit per account.
days_visited_by_account = sum_grouped_items(
    grouped_data=engagement_by_account,
    field_name='has_visited')
describe_data(data=days_visited_by_account.values())
Splitting out Passing Students
Splitting out Passing Students
Here is the code Caroline shows in the solution video:
# Lesson keys that identify the subway project.
subway_project_lesson_keys = ['746169184', '3176718735']

# Account keys of students with a passing rating on the subway project.
pass_subway_project = set()
for submission in paid_submissions:
    project = submission['lesson_key']
    rating = submission['assigned_rating']

    if (project in subway_project_lesson_keys and
            rating in ('PASSED', 'DISTINCTION')):
        pass_subway_project.add(submission['account_key'])

len(pass_subway_project)

# Split the first-week engagement rows by whether the student passed.
passing_engagement = []
non_passing_engagement = []
for engagement_record in paid_engagement_in_first_week:
    if engagement_record['account_key'] in pass_subway_project:
        passing_engagement.append(engagement_record)
    else:
        non_passing_engagement.append(engagement_record)

print(len(passing_engagement))
print(len(non_passing_engagement))
Comparing the Two Student Groups
Comparing the Two Student Groups
Here is the code Caroline shows in the solution video:
# Group each cohort's engagement rows per account.
passing_engagement_by_account = group_data(
    passing_engagement, 'account_key')
non_passing_engagement_by_account = group_data(
    non_passing_engagement, 'account_key')

# --- Total minutes visited ---
print('non-passing students:')
non_passing_minutes = sum_grouped_items(
    non_passing_engagement_by_account, 'total_minutes_visited')
describe_data(non_passing_minutes.values())

print('passing students:')
passing_minutes = sum_grouped_items(
    passing_engagement_by_account, 'total_minutes_visited')
describe_data(passing_minutes.values())

# --- Lessons completed ---
print('non-passing students:')
non_passing_lessons = sum_grouped_items(
    non_passing_engagement_by_account, 'lessons_completed')
describe_data(non_passing_lessons.values())

print('passing students:')
passing_lessons = sum_grouped_items(
    passing_engagement_by_account, 'lessons_completed')
describe_data(passing_lessons.values())

# --- Rows with a visit (sum of the has_visited flag) ---
print('non-passing students:')
non_passing_visits = sum_grouped_items(
    non_passing_engagement_by_account, 'has_visited')
describe_data(non_passing_visits.values())

print('passing students:')
passing_visits = sum_grouped_items(
    passing_engagement_by_account, 'has_visited')
describe_data(passing_visits.values())
Making Histograms
Making Histograms
Here is the code Caroline shows in the solution video:
%pylab inline
import matplotlib.pyplot as plt
import numpy as np
# Summarize the given data
def describe_data(data):
    """Print summary statistics for ``data`` and draw its histogram.

    Args:
        data: Iterable of numbers, e.g. the ``.values()`` of a dict of
            per-account totals.
    """
    # Copy into a list first so that dict views are accepted by NumPy
    # and by the plotting call.
    values = list(data)
    # '%s' formatting prints the same text under Python 2 and Python 3.
    print('Mean: %s' % np.mean(values))
    print('Standard deviation: %s' % np.std(values))
    print('Minimum: %s' % np.min(values))
    print('Maximum: %s' % np.max(values))
    plt.hist(values)
Fixing the Number of Bins
To change how many bins are shown for each plot, try using the bins argument to the hist function. You can find the hist function and the arguments it takes in the matplotlib documentation for pyplot.hist.
Improving Plots and Sharing Findings
Improving Plots and Sharing Findings
Here is the code Caroline shows in the solution video:
import seaborn as sns
# Histogram for the non-passing cohort.
plt.hist(non_passing_visits.values(), bins=8)
plt.xlabel('Number of days')
plt.title(
    'Distribution of classroom visits in the first week '
    'for students who do not pass the subway project')

# Histogram for the passing cohort.
plt.hist(passing_visits.values(), bins=8)
plt.xlabel('Number of days')
plt.title(
    'Distribution of classroom visits in the first week '
    'for students who pass the subway project')